Scripts to run CAVIAR, FINEMAP (version 1.1) and DAP-G

# knitr::opts_chunk$set(error = TRUE) # Knit even with errors

# devtools::install_local("geneRefineR/", force = T)
# library("echolocatoR")
source("echolocatoR/R/MAIN.R") 

library(readxl)
library(DT)
library(data.table)
library(dplyr)
library(ggplot2)
library(plotly)
library(cowplot)
library(ggrepel)
library(curl)
library(biomaRt) 

# Ensembl LD API
# library(httr)
# library(jsonlite)
# library(xml2)
# library(gaston)
# library(RCurl) 
# *** susieR ****
# library(knitrBootstrap) #install_github('jimhester/knitrBootstrap')
library(susieR) # devtools::install_github("stephenslab/susieR")
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.14.4
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] susieR_0.6.2.0411  biomaRt_2.38.0     tidyr_0.8.3       
##  [4] gaston_1.5.5       RcppParallel_4.4.2 Rcpp_1.0.1        
##  [7] curl_3.3           ggrepel_0.8.0      cowplot_0.9.4     
## [10] plotly_4.9.0       ggplot2_3.1.1      dplyr_0.8.0.1     
## [13] data.table_1.12.2  DT_0.5.2           readxl_1.3.1      
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-38      prettyunits_1.0.2    assertthat_0.2.1    
##  [4] digest_0.6.18        R6_2.4.0             cellranger_1.1.0    
##  [7] plyr_1.8.4           stats4_3.5.1         RSQLite_2.1.1       
## [10] evaluate_0.13        httr_1.4.0           pillar_1.3.1        
## [13] rlang_0.3.4          progress_1.2.0       lazyeval_0.2.2      
## [16] blob_1.1.1           S4Vectors_0.20.1     Matrix_1.2-17       
## [19] rmarkdown_1.12       stringr_1.4.0        htmlwidgets_1.3     
## [22] RCurl_1.95-4.12      bit_1.1-14           munsell_0.5.0       
## [25] compiler_3.5.1       xfun_0.6             pkgconfig_2.0.2     
## [28] BiocGenerics_0.28.0  htmltools_0.3.6      tidyselect_0.2.5    
## [31] tibble_2.1.1         expm_0.999-4         IRanges_2.16.0      
## [34] XML_3.98-1.19        viridisLite_0.3.0    crayon_1.3.4        
## [37] withr_2.1.2          bitops_1.0-6         grid_3.5.1          
## [40] jsonlite_1.6         gtable_0.3.0         DBI_1.0.0           
## [43] magrittr_1.5         scales_1.0.0         stringi_1.4.3       
## [46] tools_3.5.1          bit64_0.9-7          Biobase_2.42.0      
## [49] glue_1.3.1           purrr_0.3.2          hms_0.4.2           
## [52] parallel_3.5.1       yaml_2.2.0           AnnotationDbi_1.44.0
## [55] colorspace_1.4-1     memoise_1.1.0        knitr_1.22
print(paste("susieR ", packageVersion("susieR")))  
## [1] "susieR  0.6.2.411"
# MINERVA PATHS TO SUMMARY STATISTICS DATASETS: 
Data_dirs <- read.csv("./Data/directories_table.csv")

# Direct to local data files if not working on Minerva
if(getwd()=="/Users/schilder/Desktop/Fine_Mapping"){
  vcf_folder = F # Use internet if I'm working from my laptop
} else{
  vcf_folder = F#"../1000_Genomes_VCFs/Phase1" # Use previously downloaded files if working on Minerva node
}  

force_new_subset = F
allResults <- list()

Parkinson’s Disease

Nalls et. al. (2019) w/ 23andMe

Rare coding variant burden analysis:
We performed kernel-based burden tests on the 113 genes in our PD GWAS loci that contained two or more rare coding variants (MAF< 5% or MAF < 1%). After Bonferroni correction for 113 genes, we identified 7 significant putatively causal genes: - LRRK2, GBA, CATSPER3 (rs11950533/C5orf24 locus) - LAMB2 (rs12497850/IP6K2 locus) - LOC442028 (rs2042477/KCNIP3 locus) - NFKB2 (rs10748818/GBF1 locus), and SCARB2 (rs6825004 locus).”

– from Nalls et al. (2019)

  • maxPos for LRRK2 fine-mapping was set to 40,936,639, which is the the nearest downstream non-LRRK2-associated locus (CNTN1)
dataset_name <- "Nalls23andMe_2019"

top_SNPs <- Nalls_top_SNPs <- import_topSNPs(
  file_path = Directory_info(Data_dirs, dataset_name, "topSumStats"),
  chrom_col = "CHR", position_col = "BP", snp_col="SNP",
  pval_col="P, all studies", effect_col="Beta, all studies", gene_col="Nearest Gene",
  caption= "Nalls et al. (2018) w/ 23andMe PD GWAS Summary Stats")

# Manually add new SNP of interest
top_SNPs <- rbind(top_SNPs, 
                  data.table::data.table(Coord="14:54893485", CHR=14, POS=54893485, SNP="rs3783642", P=NA, Effect=NA, Gene="ATG14"))


gene_list <- c("LRRK2", "TMEM163", "GPNMB", "CTSB", "ATG14")# "LRRK2_alt","SNCA","VPS13C","C5orf24","IP6K2","KCNIP3","GBF1","SCARB2")#, top_SNPs$Gene) %>% unique()

allResults[[dataset_name]] <- finemap_geneList(top_SNPs,
                                               geneList=gene_list, 
                                               dataset_name = dataset_name, 
                                               dataset_type = "GWAS",
                                               query_by ="coordinates",
                                               subset_path = "auto",
                                               finemap_method = "susie", 
                 # file_path = Directory_info(dataset_name, "fullSumStats"),
                 fullSS_path = "Data/GWAS/Nalls23andMe_2019/nallsEtAl2019_allSamples_allVariants.mod.txt",  
                 chrom_col = "CHR", position_col = "POS", snp_col = "RSID",
                 pval_col = "p", effect_col = "beta", stderr_col = "se", MAF_col = "MAF",
                 vcf_folder = vcf_folder, superpopulation = "EUR", force_new_subset = F,
                 LD_block = F, block_size = .7, 
                 remove_variants=c("rs34637584"),
                 num_causal = 1)

LRRK2

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/GWAS/Nalls23andMe_2019/LRRK2/LRRK2_Nalls23andMe_2019_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Previously computed LD Matrix identified. Importing… ++++++++++

++++++++++ LD matrix calculated in 1.16 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-3757.22430938018” [1] “objective:-3754.9881063744” [1] “objective:-3754.98437444501” [1] “objective:-3754.98436821551” [1] “objective:-3754.98436820552” [1] “objective:-3754.98436820551”

  • Formatting susieR results
  • Identifying Credible Set…

****** 6 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 11 seconds.

LRRK2 fine-mapped in 18.26 seconds

TMEM163

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/GWAS/Nalls23andMe_2019/TMEM163/TMEM163_Nalls23andMe_2019_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/TMEM163_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.43 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-2653.78595588928” [1] “objective:-2653.37289639925” [1] “objective:-2653.37264152331” [1] “objective:-2653.3726413677”

  • Formatting susieR results
  • Identifying Credible Set…

****** 1 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 4 seconds.

TMEM163 fine-mapped in 9.97 seconds

GPNMB

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/GWAS/Nalls23andMe_2019/GPNMB/GPNMB_Nalls23andMe_2019_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/GPNMB_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.62 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-3671.85545167874” [1] “objective:-3670.77496658198” [1] “objective:-3670.77321466314” [1] “objective:-3670.77321184107” [1] “objective:-3670.77321183669” [1] “objective:-3670.77321183668”

  • Formatting susieR results
  • Identifying Credible Set…

****** 33 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 8 seconds.

GPNMB fine-mapped in 14.62 seconds

CTSB

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/GWAS/Nalls23andMe_2019/CTSB/CTSB_Nalls23andMe_2019_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/CTSB_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.83 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-4457.84901816636” [1] “objective:-4457.53258739532” [1] “objective:-4457.5323407588” [1] “objective:-4457.53234056553”

  • Formatting susieR results
  • Identifying Credible Set…

****** 30 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 15 seconds.

CTSB fine-mapped in 21.85 seconds

ATG14

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/GWAS/Nalls23andMe_2019/ATG14/ATG14_Nalls23andMe_2019_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/ATG14_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.74 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-3469.47929863011” [1] “objective:-3469.05590716079” [1] “objective:-3469.0550645008” [1] “objective:-3469.05506281449”

  • Formatting susieR results
  • Identifying Credible Set…

****** 11 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 8 seconds.

## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text_repel).

ATG14 fine-mapped in 14.65 seconds

cred_set <- subset(allResults[[dataset_name]], credible_set==T & Gene=="LRRK2")

manhattan_plot(# subset_path= get_subset_path(results_path = Directory_info(Data_dirs, dataset_name, "fullSumStats"), gene="LRRK2"),
               subset_path="./Data/GWAS/Nalls23andMe_2019/LRRK2/LRRK2_Nalls23andMe_2019_subset.txt",
               gene="LRRK2", SNP_list = c("rs76904798","rs34637584","rs117073808"), alt_color_SNPs=c("rs34637584"))

eQTL

  • AFR = AFA = African
  • AMR = Ad Mixed American
  • EAS = East Asian
  • EUR = CAU = European
  • SAS = South Asian
  • HIS = Hispanic

MESA - AFA

dataset_name <- "MESA_AFA" 

allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  
                                                superpopulation = "AFA",
                                                dataset_name = dataset_name, 
                                                dataset_type = "eQTL",
                                                top_SNPs = Nalls_top_SNPs, 
                                                query_by = "gene",
                   # fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   fullSS_path = "Data/eQTL/MESA_AFA/LRRK2_AFA_MESA.txt",
                   subset_path = "auto",
                   chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
                   pval_col="pvalue", effect_col="beta", gene_col="gene_name", 
                   stderr_col = "calculate",  tstat_col = "statistic",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = F,
                   LD_block = F)

LRRK2

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/eQTL/MESA_AFA/LRRK2/LRRK2_MESA_AFA_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 1.53 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-5763.20635450473” [1] “objective:-5763.20613841342” [1] “objective:-5763.20613798223” [1] “objective:-5763.20613798137”

  • Formatting susieR results
  • Identifying Credible Set…

— ERROR — ****** Could NOT identify credible set. Default to SNPs with the top 5 PIPs ****** + Fine-mapping with ‘susie’ completed in 30 seconds.

LRRK2 fine-mapped in 36.89 seconds

MESA - CAU

dataset_name <- "MESA_CAU"

allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  
                                                superpopulation = "CAU",
                                                dataset_name = dataset_name, 
                                                dataset_type = "eQTL",
                                                top_SNPs = Nalls_top_SNPs, 
                                                query_by = "gene",
                   # fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   fullSS_path = "Data/eQTL/MESA_CAU/LRRK2_CAU_MESA.txt",
                   subset_path = "auto",
                   chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
                   pval_col="pvalue", effect_col="beta", gene_col="gene_name", 
                   stderr_col = "calculate",  tstat_col = "statistic",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = F,
                   LD_block = F)

LRRK2

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/eQTL/MESA_CAU/LRRK2/LRRK2_MESA_CAU_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.93 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-3771.01872348208” [1] “objective:-3770.81266495247” [1] “objective:-3770.81212407217” [1] “objective:-3770.81212264564”

  • Formatting susieR results
  • Identifying Credible Set…

— ERROR — ****** Could NOT identify credible set. Default to SNPs with the top 5 PIPs ****** + Fine-mapping with ‘susie’ completed in 9 seconds.

LRRK2 fine-mapped in 15.73 seconds

MESA - HIS

Used the Ad Mixed American (AMR) reference panel given the absence of a Hispanic reference panel for 1KG_Phase1.

dataset_name <- "MESA_HIS" 

allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  
                                                superpopulation = "HIS",
                                                dataset_name = dataset_name, 
                                                dataset_type = "eQTL",
                                                top_SNPs = Nalls_top_SNPs, 
                                                query_by = "gene",
                   # fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   fullSS_path = "Data/eQTL/MESA_HIS/LRRK2_HIS_MESA.txt",
                   subset_path = "auto",
                   chrom_col = "chr", position_col = "pos_snps", snp_col="snps",
                   pval_col="pvalue", effect_col="beta", gene_col="gene_name", 
                   stderr_col = "calculate",  tstat_col = "statistic",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = F,
                   LD_block = F)

LRRK2

Results path =

  • Extracting subset of data… Subset file already exists. Importing Data/eQTL/MESA_HIS/LRRK2/LRRK2_MESA_HIS_subset.txt …

Subset file looks good! :) + Computing LD matrix… LD Reference Panel = 1KG_Phase1 Creating ‘../1000_Genomes_VCFs’ directory. Identified matching VCF subset file. Importing… ../1000_Genomes_VCFs/Phase1/LRRK2_subset.vcf

—— Importing VCF as bed file… ——

++++++++++ Reading in BIM file… ++++++++++

++++++++++ Calculating LD ++++++++++

++++++++++ LD matrix calculated in 0.96 seconds. ++++++++++ Error in subset.default(subset_DT, leadSNP == T) : object ‘leadSNP’ not found

  • Fine-mapping with SusieR… [1] “objective:-4704.09026794048” [1] “objective:-4703.84677509606” [1] “objective:-4703.8464861313” [1] “objective:-4703.84648578904”

  • Formatting susieR results
  • Identifying Credible Set…

****** 19 SNPs included in Credible Set ****** + Fine-mapping with ‘susie’ completed in 16 seconds.

LRRK2 fine-mapped in 22.12 seconds

Fairfax (2014)

  • Use the Nalls et al. (2019) LRRK2 locus to query for SNPs in the eQTL data.
  • SS Files are SPACE-delimited (not tab-delimited)
# +++++++++++++++ CD14 Stimulation +++++++++++++++ #
dataset_name <- "Fairfax_2014_CD14"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  
                                                superpopulation = "CAU",
                                                dataset_name = dataset_name, 
                                                dataset_type = "eQTL",
                                                top_SNPs = Nalls_top_SNPs, 
                                                query_by = "probes",
                                                probe_path = "./Data/eQTL/gene.ILMN.map",
                   file_sep = " ",                            
                   # fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   fullSS_path = "./Data/eQTL/Fairfax_2014_CD14/LRRK2_Fairfax_CD14.txt",
                   subset_path = "auto",
                   chrom_col = "CHR", position_col = "POS", snp_col="SNP",
                   pval_col="p-value", effect_col="beta", gene_col="gene", 
                   stderr_col = "calculate",  tstat_col = "t-stat",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = force_new_subset,
                   LD_block = T)


# +++++++++++++++ IFN Stimulation +++++++++++++++ #
dataset_name <- "Fairfax_2014_IFN"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  superpopulation = "CAU",
                                                top_SNPs = Nalls_top_SNPs, query_by = "coordinates_merged",
                   file_sep = " ",                             
                   fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_IFN_subset.txt",
                   chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
                   pval_col="p-value", effect_col="beta", gene_col="gene", 
                   stderr_col = "calculate",  tstat_col = "t-stat",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = force_new_subset,
                   LD_block = T)

# +++++++++++++++ LPS Stimulation (after 2 hours) +++++++++++++++ #
dataset_name <- "Fairfax_2014_LPS2"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  superpopulation = "CAU",
                                                top_SNPs = Nalls_top_SNPs, 
                                                # query_by = "probes", # Use if querying the main file
                                                query_by = "coordinates_merged", 
                   file_sep = " ",                              
                   fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_LPS2_subset.txt",
                   chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
                   pval_col="p-value", effect_col="beta", gene_col="gene", 
                   stderr_col = "calculate",  tstat_col = "t-stat",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = force_new_subset,
                   LD_block = T)


# +++++++++++++++ LPS Stimulation (after 24 hours) +++++++++++++++ #
dataset_name <- "Fairfax_2014_LPS24"
allResults[[dataset_name]] <- finemap_geneList( geneList=c("LRRK2"),  superpopulation = "CAU",
                                                top_SNPs = Nalls_top_SNPs, query_by = "coordinates_merged",
                   file_sep = " ",                             
                   fullSS_path = Directory_info(dataset_name, "fullSumStats"),
                   subset_path = "Data/eQTL/Fairfax/LRRK2_Fairfax_LPS24_subset.txt",
                   chrom_col = "SNP", position_col = "SNP", snp_col="SNP",
                   pval_col="p-value", effect_col="beta", gene_col="gene", 
                   stderr_col = "calculate",  tstat_col = "t-stat",
                   vcf_folder = vcf_folder, num_causal = 1, 
                   force_new_subset = force_new_subset,
                   LD_block = T)

Merge Results

merged_results <- merge_finemapping_results(allResults, csv_path ="Data/merged_finemapping_results.csv")
createDT(merged_results, caption = "Finemapped Credible Sets for Multiple Datasets")
# table(as.character(merged_results$SNP))